library(readr)
library(plotly)
library(ggplot2)
library(highcharter)
library(ngram)
library(png)
library(stringr)
library(dplyr)
library("tm")
library("wordcloud")
library(corrplot)
library(arules)
library(arulesViz)
library(colorspace)
# Load the three MovieLens tables (movies, tags, ratings) from ./movie.
# NOTE(review): the absolute setwd() below ties the script to one machine;
# consider running from the project directory instead.
setwd("/Users/Apple/Documents/TaraFiles/University/term 8/Data Analysis/week 12/")

# The NA entries in col_names are placeholders for the columns that sit
# between the real fields; they are discarded by the positional select().
# NOTE(review): presumably these filler columns appear because "::" is read
# as two single-character delimiters -- confirm against the installed readr
# version, since titles that contain ":" may be split as well.
movie <- read_delim(
  "./movie/movies.dat",
  delim = "::",
  col_names = c("MovieID", NA, "Title", NA, "Genres")
) %>%
  select(1, 3, 5) %>%
  mutate(MovieID = as.numeric(MovieID))

tag <- read_delim(
  "./movie/tags.dat",
  delim = "::",
  col_names = c("UserID", NA, "MovieID", NA, "Tag", NA, "Timestamp")
) %>%
  select(1, 3, 5, 7) %>%
  mutate(MovieID = as.numeric(MovieID))

rating <- read_delim(
  "./movie/ratings.dat",
  delim = "::",
  col_names = c("UserID", NA, "MovieID", NA, "Rating", NA, "Timestamp")
) %>%
  select(1, 3, 5, 7) %>%
  mutate(
    MovieID = as.numeric(MovieID),
    Rating = as.numeric(Rating)
  )
# Most popular movies: mean rating per movie, best first.
# `popularity` (MovieID, meanRate) is reused by later sections.
popularity <- rating %>%
  select(MovieID, Rating) %>%
  group_by(MovieID) %>%
  summarise(meanRate = mean(Rating, na.rm = TRUE)) %>%
  arrange(desc(meanRate))
popularID <- popularity$MovieID[1:5]
# Titles come back in the order of `movie`, not in rank order; IDs that are
# absent from `movie` simply produce no title.
movie$Title[movie$MovieID %in% popularID]
## [1] "Satan's Tango (Sátántangó) (1994)"
## [2] "Shadows of Forgotten Ancestors (1964)"
## [3] "Fighting Elegy (Kenka erejii) (1966)"
# Most rated movies: count the rating rows per movie, largest count first.
num_of_Comments <- rating %>%
  select(MovieID, Rating) %>%
  group_by(MovieID) %>%
  summarise(numComments = n()) %>%
  arrange(desc(numComments))
num_of_CommentsID <- num_of_Comments$MovieID[1:5]
# Titles are listed in the order of `movie`, not by number of ratings.
movie$Title[movie$MovieID %in% num_of_CommentsID]
## [1] "Pulp Fiction (1994)" "Shawshank Redemption, The (1994)"
## [3] "Forrest Gump (1994)" "Jurassic Park (1993)"
## [5] "Silence of the Lambs, The (1991)"
# Least popular movies: the same per-movie means, sorted ascending.
L_pop <- arrange(popularity, meanRate)
L_popID <- L_pop$MovieID[1:3]
movie$Title[movie$MovieID %in% L_popID]
## [1] "Besotted (2001)" "Hi-Line, The (1999)"
# Number of movies released per year.
# The year is read from the four characters before the closing parenthesis
# at the end of the title; titles that do not end that way become NA.
movie$year <- as.numeric(str_sub(movie$Title, -5, -2))
Movie_Year <- movie %>%
  filter(!is.na(year)) %>%
  group_by(year) %>%
  summarise(numMovie = n())
# NOTE(review): the first ten rows are dropped by position; the reason is not
# visible here (presumably sparse or mis-parsed early years) -- confirm.
Movie_Year <- Movie_Year[-seq_len(10), ]
hchart(
  Movie_Year,
  type = "bar",
  hcaes(x = year, y = numMovie),
  name = "num of movie each year"
) %>%
  hc_title(text = "num of movie each year")
# Favourite genre of each year, step 1: one 0/1 indicator column per genre.
# Names are the new column names, values are the patterns searched for in
# `Genres`. The columns are appended in this order, after the existing
# columns of `movie`.
genre_patterns <- c(
  Action = "Action",
  Adventure = "Adventure",
  Animation = "Animation",
  Children = "Children",
  Comedy = "Comedy",
  Crime = "Crime",
  Documentary = "Documentary",
  Drama = "Drama",
  Fantasy = "Fantasy",
  Film_Noir = "Film-Noir",
  Horror = "Horror",
  Mystery = "Mystery",
  Romance = "Romance",
  Sci_Fi = "Sci-Fi",
  Thriller = "Thriller",
  War = "War",
  Western = "Western"
)
movie_with_genres <- movie
for (flag in names(genre_patterns)) {
  movie_with_genres[[flag]] <-
    as.numeric(str_detect(movie$Genres, genre_patterns[[flag]]))
}
# Attach each movie's mean rating; movies without ratings keep NA meanRate.
genre_movie_rate <- right_join(popularity, movie_with_genres, by = "MovieID")
# Favourite genre of each year, step 2: per release year, the mean rating of
# the movies carrying each genre flag (sum of flag * meanRate over the number
# of flagged movies). Rows with any NA are dropped first.
movie_year_genres <- genre_movie_rate %>%
  group_by(year) %>%
  na.omit() %>%
  summarise(
    Action = sum(Action * meanRate) / sum(Action),
    Adventure = sum(Adventure * meanRate) / sum(Adventure),
    Animation = sum(Animation * meanRate) / sum(Animation),
    Children = sum(Children * meanRate) / sum(Children),
    Comedy = sum(Comedy * meanRate) / sum(Comedy),
    Crime = sum(Crime * meanRate) / sum(Crime),
    Documentary = sum(Documentary * meanRate) / sum(Documentary),
    Drama = sum(Drama * meanRate) / sum(Drama),
    Fantasy = sum(Fantasy * meanRate) / sum(Fantasy),
    Film_Noir = sum(Film_Noir * meanRate) / sum(Film_Noir),
    Horror = sum(Horror * meanRate) / sum(Horror),
    Mystery = sum(Mystery * meanRate) / sum(Mystery),
    Romance = sum(Romance * meanRate) / sum(Romance),
    Sci_Fi = sum(Sci_Fi * meanRate) / sum(Sci_Fi),
    Thriller = sum(Thriller * meanRate) / sum(Thriller),
    War = sum(War * meanRate) / sum(War),
    Western = sum(Western * meanRate) / sum(Western)
  )

# Every column except `year` is a genre. Capture the names now, before the
# helper columns are appended, and select by name from here on: the previous
# positional range 2:17 stopped one column short and silently left Western
# (column 18) out of both the maximum and the winning-genre lookup.
genre_cols <- setdiff(names(movie_year_genres), "year")

# A year with no movie of some genre yields 0 / 0 = NaN; score it as 0.
movie_year_genres[is.na(movie_year_genres)] <- 0

# Best genre score per year and the name of the genre that achieved it.
# Ties go to the first genre in column order, as which.max() did.
movie_year_genres$max <- do.call(pmax, movie_year_genres[genre_cols])
genre_scores <- as.matrix(movie_year_genres[genre_cols])
movie_year_genres$pop_genre <-
  genre_cols[max.col(genre_scores, ties.method = "first")]
# Bar charts of the best-rated genre per year: all years, then after 1980.
# The subtitle text is passed as `text =`, like the title, so that it is
# actually set as the chart's subtitle option.
movie_year_genres %>%
  hchart(
    type = "bar",
    hcaes(x = year, y = max, name = pop_genre, group = pop_genre)
  ) %>%
  hc_title(text = "pop_genre of each year") %>%
  hc_subtitle(text = "hold on the bar to see the name of the genre")

movie_year_genres %>%
  filter(year > 1980) %>%
  hchart(
    type = "bar",
    hcaes(x = year, y = max, name = pop_genre, group = pop_genre)
  ) %>%
  hc_title(text = "pop_genre of each year (after 1980)") %>%
  hc_subtitle(text = "hold on the bar to see the name of the genre")
***
# Number of movies per genre.
# Display labels for the genres; also used by the mean-rating chart later on.
genreName <- c(
  "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
  "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
  "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
)
# Split every "A|B|C" genre string into single labels and tabulate them.
genre_tokens <- unlist(strsplit(movie$Genres, "\\|"))
GenereInMovie <- as.data.frame(table(genre_tokens), stringsAsFactors = FALSE)
# NOTE(review): the first 19 table rows are dropped by position; presumably
# they are junk labels from mis-parsed lines -- confirm before relying on it.
GenereInMovie <- GenereInMovie[-seq_len(19), ]
names(GenereInMovie) <- c("Genre", "Freq")
hchart(
  arrange(GenereInMovie, Freq),
  type = "bar",
  hcaes(x = Genre, y = Freq),
  name = " num of movie of each genre"
) %>%
  hc_title(text = " num of movie of each genre")
#### Genre correlation plot
# Columns 5:21 of movie_with_genres are taken as the genre indicator columns;
# rows containing NA are removed before computing Pearson correlations.
m <- na.omit(movie_with_genres[, 5:21])
res <- cor(m, method = "pearson", use = "complete.obs")
corrplot(
  res,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)
# Average rating of each genre
# For every genre: the mean of meanRate over the movies flagged with that
# genre, i.e. a mean weighted by the 0/1 indicator column. Rows with any NA
# are dropped first.
rated_movies <- na.omit(genre_movie_rate)
ccc <- summarise(
  rated_movies,
  Action_ = weighted.mean(meanRate, Action),
  Adventure_ = weighted.mean(meanRate, Adventure),
  Animation_ = weighted.mean(meanRate, Animation),
  Children_ = weighted.mean(meanRate, Children),
  Comedy_ = weighted.mean(meanRate, Comedy),
  Crime_ = weighted.mean(meanRate, Crime),
  Documentary_ = weighted.mean(meanRate, Documentary),
  Drama_ = weighted.mean(meanRate, Drama),
  Fantasy_ = weighted.mean(meanRate, Fantasy),
  Film_Noir_ = weighted.mean(meanRate, Film_Noir),
  Horror_ = weighted.mean(meanRate, Horror),
  Mystery_ = weighted.mean(meanRate, Mystery),
  Romance_ = weighted.mean(meanRate, Romance),
  Sci_Fi_ = weighted.mean(meanRate, Sci_Fi),
  Thriller_ = weighted.mean(meanRate, Thriller),
  War_ = weighted.mean(meanRate, War),
  Western_ = weighted.mean(meanRate, Western)
)
# Pair the single summary row with the display labels in genreName
# (same genre order) and chart the genres from lowest to highest mean.
rate <- as.numeric(ccc[1, ])
c4 <- data.frame(genreName = genreName, rate = rate)
hchart(
  arrange(c4, rate),
  type = "bar",
  hcaes(x = genreName, y = rate, name = genreName, group = genreName)
) %>%
  hc_title(text = "mean rate of genres")
# Golden age of film-making
# Mean of the per-movie mean ratings for each release year, best years first.
# NOTE(review): mean() is called without na.rm, so a year containing any
# movie without a rating becomes NA and is removed by na.omit() below --
# confirm that this is intended.
goldenAge <- right_join(popularity, movie, by = "MovieID") %>%
  group_by(year) %>%
  summarise(meanRateYear = mean(meanRate))
# The first ten rows are dropped by position, as in the movies-per-year chart.
goldenAge <- goldenAge[-seq_len(10), ]
goldenAge <- goldenAge %>%
  na.omit() %>%
  arrange(desc(meanRateYear))
head(goldenAge, n = 3)
## # A tibble: 3 x 2
## year meanRateYear
## <dbl> <dbl>
## 1 1924 3.93
## 2 1916 3.79
## 3 1931 3.71
# Word cloud of the words used in movie titles.
# pureTitle is the title without its last six characters (the year suffix).
movieTitle <- movie %>%
  mutate(pureTitle = str_sub(Title, 1, -7)) %>%
  select(pureTitle)

# NOTE(review): movieTitle is a one-column tibble, not a character vector, so
# the string functions below operate on its coerced text form; the positional
# row removal further down presumably cleans up the resulting junk tokens.
title_tokens <- str_replace_all(movieTitle, "[[:punct:]]", " ")
title_tokens <- unlist(str_split(title_tokens, pattern = "\\s"))
title_tokens <- str_to_lower(title_tokens)
title_tokens <- removeWords(title_tokens, stopwords("en"))
title_tokens <- removeWords(title_tokens, stopwords("fr"))
title_tokens <- str_trim(title_tokens)

# Frequency table of the remaining tokens, most frequent first; rare words
# (count of 5 or fewer) are discarded.
title_word <- as.data.frame(table(title_tokens), stringsAsFactors = FALSE)
colnames(title_word) <- c("word", "count")
title_word <- title_word %>%
  arrange(desc(count)) %>%
  filter(count > 5)
# Hand-picked rows removed by position.
title_word <- title_word[-c(1:4, 9, 11, 35, 44, 64, 75, 102), ]
wordcloud(
  title_word$word,
  title_word$count,
  scale = c(5, .3),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
# Q4
# Association rules over the movies each user liked (rating above 3).
# One row per user, with the liked MovieIDs joined into a comma-separated
# string.
User_Basket <- rating %>%
  filter(Rating > 3) %>%
  select(MovieID, UserID) %>%
  group_by(UserID) %>%
  summarise(basket = concatenate(MovieID, collapse = ","))
# Split each string back into a character vector of IDs: one basket per user.
basket <- strsplit(User_Basket$basket, split = ",")
grules <- apriori(
  basket,
  parameter = list(support = 0.009, confidence = 0.25, minlen = 2)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.009 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 628
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10472 item(s), 69816 transaction(s)] done [0.78s].
## sorting and recoding items ... [1750 item(s)] done [0.13s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 done [18.13s].
## writing ... [11251266 rule(s)] done [0.95s].
## creating S4 object ... done [2.29s].
# Print the first ten mined rules, in the order apriori() returned them.
inspect(grules[seq_len(10)])
## lhs rhs support confidence lift count
## [1] {140} => {62} 0.009381804 0.5165615 4.795141 655
## [2] {135} => {780} 0.009367480 0.6318841 3.329229 654
## [3] {257} => {161} 0.009410450 0.6021998 4.542263 657
## [4] {257} => {349} 0.009840151 0.6296975 4.026650 687
## [5] {257} => {150} 0.010112295 0.6471127 2.315557 706
## [6] {257} => {47} 0.009195600 0.5884510 2.315465 642
## [7] {257} => {457} 0.011573278 0.7406049 2.278202 808
## [8] {257} => {110} 0.010255529 0.6562786 1.999335 716
## [9] {257} => {356} 0.009797181 0.6269478 1.665246 684
## [10] {257} => {318} 0.011272488 0.7213566 1.769704 787
# Seed movies the recommendations are based on (title, MovieID):
#   Castle in the Sky (1986)        6350
#   Cast Away (2000)                4022
#   No Country for Old Men (2007)   51372
#   Memento (2000)                  4226
seed_movies <- c("6350", "4022", "51372", "4226")

# Keep the rules whose left-hand side contains at least one seed movie.
# %in% matches item labels exactly. The previous %pin% does partial (regular
# expression) matching, and the recorded output below shows that only the
# first ID (6350) was ever matched.
# NOTE: the recorded output that follows was produced before this change.
movierules <- subset(grules, lhs %in% seed_movies)

# Strongest rules by lift; at most 32, without failing when fewer exist.
top_rules <- sort(movierules, by = "lift")
top_rules <- top_rules[seq_len(min(32, length(top_rules)))]
# inspect() prints the rules; its return value is kept for the lookup below.
mn <- inspect(top_rules)
## lhs rhs support confidence lift count
## [1] {5618,6350} => {3000} 0.010398762 0.8066667 16.433685 726
## [2] {4993,6350} => {3000} 0.009510714 0.7923628 16.142282 664
## [3] {2571,6350} => {3000} 0.009081013 0.7788698 15.867398 634
## [4] {6350} => {3000} 0.011344105 0.7600768 15.484540 792
## [5] {3000,6350} => {5618} 0.010398762 0.9166667 14.917949 726
## [6] {296,6350} => {5618} 0.009195600 0.8991597 14.633038 642
## [7] {6350,7153} => {5618} 0.009911768 0.8975357 14.606608 692
## [8] {5952,6350} => {5618} 0.009840151 0.8910506 14.501069 687
## [9] {4993,6350} => {5618} 0.010670906 0.8890215 14.468047 745
## [10] {1196,6350} => {5618} 0.009281540 0.8888889 14.465890 648
## [11] {2959,6350} => {5618} 0.009066690 0.8853147 14.407723 633
## [12] {2571,6350} => {5618} 0.010255529 0.8796069 14.314833 716
## [13] {260,6350} => {5618} 0.009195600 0.8770492 14.273209 642
## [14] {6350} => {5618} 0.012891028 0.8637236 14.056347 900
## [15] {5952,6350} => {7153} 0.010026355 0.9079118 5.938983 700
## [16] {4993,6350} => {7153} 0.010441733 0.8699284 5.690520 729
## [17] {2571,6350} => {7153} 0.009424774 0.8083538 5.287738 658
## [18] {6350} => {4973} 0.009711241 0.6506718 5.267544 678
## [19] {6350,7153} => {5952} 0.010026355 0.9079118 5.177389 700
## [20] {4993,6350} => {5952} 0.010627936 0.8854415 5.049251 742
## [21] {5618,6350} => {7153} 0.009911768 0.7688889 5.029584 692
## [22] {5952,6350} => {4993} 0.010627936 0.9623865 4.964165 742
## [23] {6350,7153} => {4993} 0.010441733 0.9455253 4.877192 729
## [24] {6350} => {7153} 0.011043314 0.7399232 4.840109 771
## [25] {1196,6350} => {4993} 0.009310187 0.8916324 4.599202 650
## [26] {2571,6350} => {5952} 0.009396127 0.8058968 4.595646 656
## [27] {260,6350} => {4993} 0.009295863 0.8866120 4.573307 649
## [28] {2571,6350} => {4993} 0.010198235 0.8746929 4.511825 712
## [29] {6350} => {3996} 0.009682594 0.6487524 4.380820 676
## [30] {5618,6350} => {5952} 0.009840151 0.7633333 4.352927 687
## [31] {3000,6350} => {4993} 0.009510714 0.8383838 4.324537 664
## [32] {5618,6350} => {4993} 0.010670906 0.8277778 4.269829 745
# Distinct right-hand sides among the inspected rules.
unique(mn$rhs)
## [1] {3000} {5618} {7153} {4973} {5952} {4993} {3996}
## Levels: {3000} {3996} {4973} {4993} {5618} {5952} {7153}
# Keep columns 1 and 3 of the inspected table (lhs and rhs, per the recorded
# output) and strip the surrounding braces from rhs to get a numeric MovieID.
moviecode <- mutate(
  mn[, c(1, 3)],
  MovieID = as.numeric(str_sub(rhs, 2, -2))
)
unique(moviecode$MovieID)
## [1] 3000 5618 7153 4973 5952 4993 3996
# Look the recommended IDs up in `movie` and show the first five rows.
recommended_movies <- filter(movie, MovieID %in% unique(moviecode$MovieID))
recommended_movies[1:5, ]
## # A tibble: 5 x 4
## MovieID Title Genres year
## <dbl> <chr> <chr> <dbl>
## 1 3000 Princess Mononoke (Mononoke-hime… Action|Adventure|Animat… 1997
## 2 3996 Crouching Tiger, Hidden Dragon (… Action|Adventure|Drama|… 2000
## 3 4973 Amelie (Fabuleux destin d'Amélie… Comedy|Romance 2001
## 4 4993 Lord of the Rings <NA> NA
## 5 5618 Spirited Away (Sen to Chihiro no… Adventure|Animation|Chi… 2001
من انتظار داشتم تئوری درس بیشتر باشد. قبول دارم که درس ، برنامه نویسی و کد زنی بود ولی جا داشت بیشتر و عمیق تر مباحث تئوری رو بررسی میکردیم. مثلا در خوشهبندی و PCA میشد عمیق تر شد و برای من جذاب تر هم میبود.
بعضی از دیتاست ها واقعا کسل کننده بودن :))) مثلا دیتاست لالیگا. تهشم نفهمیدم به چه دردی میخورد :)
قسمت های انتهایی درس، مخصوصا تمرین ۱۲ کاش بیشتر میبود. برای مباحثی مثل نقشه خیلی وقت گذاشتیم، ولی این سیستم پیشنهاد دهنده به نظرم مفیدتر و مهم تر بود.
تمرین ها خیلی هاشون گنگ بودن، و همین باعث می شد بیخودی وقت زیادی صرف یک تمرین ساده بشه.
برای پروژه تا الان هیچ فیدبکی نداشتیم. کاش تا الان روی پروپوزال اولیه فیدبک میدادین، و یا روی فاز یک نظر میدادین. اصلا نمیدونیم چه حجمی کار لازم داره. برای تمرین ها هم هیچ فیدبکی نداشتیم، ممکنه آدم اینجوری تا آخر راه رو اشتباه بره :)
ساعت کلاس رو به ۱۰-۱۲ تغییر بدید
روی چند تا تمرین اول قبل عید فیدبک بدین، که بقیه رو درست انجام بدیم.
پروژه رو خوب میشد بعد نوشتن پروپوزال یا حداقل بعد فاز یک، یک تحویل حضوری میداشت که بفهمیم پروژه رو درست انتخاب کردیم اصلا یا نه
مباحث تئوری رو عمیق تر بخونیم توی درس و حتی یک تمرین کاملا تئوری می داشتیم بد نبود
اوایل ترم که مباحث ساده تر هستند رو سریعتر درس بدین تا اواخر ترم مباحث سخت تر و مهم تر رو وقت کافی داشته باشیم.
nonlinear dimensionality reduction, Kernel PCA
linear SVM , and nonlinear SVM
Fuzzy clustering
داده‌های پزشکی نداشتیم. دیتاست های فراوانی با علائم بیمار و تشخیص پزشک وجود دارد.
در مباحث مربوط به انرژی دیتا ست های زیادی در باره برق و یا نیروگاه های انرژی های نو وجود دارد. چون اکثر بچه ها دانشجوی مهندسی اند جالب خواهد بود.
برای تشخیص ایمیل و پیامک اسپم چندین دیتا ست در کگل وجود دارد. به عنوان تمرین تحلیل متن این دیتاست ها هم جذابه.
از یک داده میتوان به گونه ای اطلاعات را خارج کرد و یا نشان داد که برداشت فردی که خروجی را میبیند متفاوت از حقیقت باشد
از آزمون فرض در پروژه درس های دیگر هم استفاده کردم.
مدل خطی و مباحث تئوری اش
factor analysis